In [2]:
import pandas as pd
import numpy as np
import warnings
import sys

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sktime.utils.plotting import plot_series, plot_lags, plot_correlations
#from visuals import *


#config to clean up the notebook
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')
In [3]:
#read the data and parse
df = pd.read_csv('sales_clean.csv')
df = df.set_index(['1', '2']).sort_index()
df.head()
df_sales = df[df['0'] == 'sales']
df_onpromotion = df[df['0'] == 'onpromotion']
df_sales.drop('0', axis=1, inplace=True)
df_sales.index.rename(['family', 'date'], level=[0,1], inplace=True)
df_sales = df_sales.unstack('family')
df_sales.columns = df_sales.columns.droplevel()
df_sales.head()

df_onpromotion.drop('0', axis=1, inplace=True)
df_onpromotion.index.rename(['family', 'date'], level=[0,1], inplace=True)
df_onpromotion = df_onpromotion.unstack('family')
df_onpromotion.columns = df_onpromotion.columns.droplevel()
In [4]:
#parse dates
df_sales.index = pd.to_datetime(df_sales.index)
df_onpromotion.index = pd.to_datetime(df_onpromotion.index)
In [5]:
from sktime.forecasting.model_selection import SlidingWindowSplitter

#separate train and test
y_train = df_sales.iloc[:-15]
y_test = df_sales.iloc[-15:]
In [6]:
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.base import ForecastingHorizon

fh = ForecastingHorizon(y_test.index, is_relative=False)

forecaster = NaiveForecaster(strategy='last', sp=7)

forecaster.fit(y_train)

y_pred = forecaster.predict(fh)
In [7]:
from sktime.performance_metrics.forecasting import MeanSquaredError

rmse = MeanSquaredError(square_root=True)

for col in y_train.columns:
    plot_series(y_train[col], y_test[col], y_pred[col], labels=['y_train', 'y_test', 'y_pred'])
    print('RMSE for {}:    {}'.format(col, rmse(y_test[col], y_pred[col])))
print('Overall RMSE {}'.format(rmse(y_test,y_pred)))
RMSE for AUTOMOTIVE:    1.358381001891627
RMSE for BABY CARE:    0.0969352019679599
RMSE for BEAUTY:    1.6163742516734927
RMSE for BEVERAGES:    629.8133697253157
RMSE for BOOKS:    0.02191140676732531
RMSE for BREAD/BAKERY:    54.638165398252255
RMSE for CELEBRATION:    2.3688554957858448
RMSE for CLEANING:    581.5756684629596
RMSE for DAIRY:    95.19699760691029
RMSE for DELI:    41.212282499624564
RMSE for EGGS:    20.67250901534757
RMSE for FROZEN FOODS:    12.506054721096808
RMSE for GROCERY I:    791.7376935103116
RMSE for GROCERY II:    11.5325489717349
RMSE for HARDWARE:    0.19580651176327848
RMSE for HOME AND KITCHEN I:    5.854356140667163
RMSE for HOME AND KITCHEN II:    7.427266496125442
RMSE for HOME APPLIANCES:    0.15180667666099854
RMSE for HOME CARE:    78.19120871893784
RMSE for LADIESWEAR:    1.791976582463979
RMSE for LAWN AND GARDEN:    1.8161242622125413
RMSE for LINGERIE:    2.289471234711228
RMSE for LIQUOR,WINE,BEER:    22.58739890864535
RMSE for MAGAZINES:    2.4666501000439687
RMSE for MEATS:    37.824619704418026
RMSE for PERSONAL CARE:    76.6610270107757
RMSE for PET SUPPLIES:    1.4669191549337932
RMSE for PLAYERS AND ELECTRONICS:    1.7508374533172635
RMSE for POULTRY:    45.92946889127437
RMSE for PREPARED FOODS:    6.923388492262947
RMSE for PRODUCE:    227.95699803772067
RMSE for SCHOOL AND OFFICE SUPPLIES:    47.81117891504797
RMSE for SEAFOOD:    3.299214022617051
Overall RMSE 85.35586256315877

Our first predictions¶

Here we have the first and simplest iteration of a predictor, using a NaiveForecaster with a seasonal period of 7 and a strategy of "last", which of means that it takes into account the previous 7 values when making its prediction. This parameter was chosen to capture some of the weekly seasonality we observed in the data.

Metric Selection¶

Here I have chosen RMSE or root mean squared error as the metric for evaluation. Advantages of this metric are the results are in the same unit as the target variable, which makes for good explainability downstream. It is also robust against 0 values, where something like MAPE or mean absolute percentage error is not. This could cause problems for us because some of the families like books for example, have 0 values in the ground truth for the test set.

Evaluating the results¶

We can see the results both visually and observe the RMSE. This will give us a baseline to evaluate our future iterations of modeling and prediction. Since RMSE gives us a value in the same unit as our target variable, it represents an error in sales units, so lower will be better.

Model Cross-Validation¶

It should be noted however, that in order to actually select the model that generalizes the best to unseen data, we should implement a cross-validation strategy that will iteratively fit models and then test on portions of our training set. We can record the RMSE for each of these iterations and then take an average to see how well it does. While this is not a concept specific to time-series analysis, some special consideration needs to be applied to the splitting process when dealing with a time series.

The most important thing is we need to prevent data leakage, and we need to have consecutive dates in our training data.

We will be using a k-fold cv strategy that uses and expanding window strategy. This would look something like this for 5 folds (k=5)

  • signifies training data * signifies test data

fold 0 : ++++++++***
fold 1 : ++++++++++++++++***
fold 2 : ++++++++++++++++++++++++***
fold 3 : ++++++++++++++++++++++++++++++++***
fold 4 : ++++++++++++++++++++++++++++++++++++++++***

In [8]:
from sklearn.model_selection import TimeSeriesSplit, KFold

#runs k fold cross validation and returns the mean of the error metric
def ts_model_cv(y_train, forecaster=None, cv=None, fh=None, metric=None, fit_fh=False):

   folds = list(cv.split_loc(y_train))
   results = []
   for n in range(cv.get_n_splits(y_train)):
      train = y_train.loc[folds[n][0]]
      test = y_train.loc[folds[n][1]]
      if fit_fh:
         forecaster.fit(train, fh=fh)
      else:
         forecaster.fit(train)
      y_pred = forecaster.predict(fh)
      results.append(metric(test,y_pred))
      print('Fold {}: {}'.format(n, metric(test,y_pred)))
   return np.mean(results)
Interpreting the Results¶

Here we can see that our RMSE across the 50 fold cross validation is a bit higher than the RMSE on our holdout test set. In real life we aren't going to have the ground truth values for the period we are interested in forecasting, so this cross validation performance will be a much more reliable metric to use to measure our models performance than comparing performance on the test set.

lets try a few more models

In [71]:
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.trend import STLForecaster
from sktime.forecasting.compose import DirRecTimeSeriesRegressionForecaster, make_reduction
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sktime.forecasting.model_selection import ExpandingWindowSplitter

model_results = {}
fh = ForecastingHorizon(np.arange(1,16))
cv = ExpandingWindowSplitter(fh=fh, initial_window=42, step_length=15)
metric = MeanSquaredError(square_root=True)

models = [
        NaiveForecaster(strategy='last', sp=7),
        ExponentialSmoothing(trend='add', seasonal='add', sp=7),
        make_reduction(LGBMRegressor(max_depth=6, n_estimators=20), window_length=13),
        make_reduction(XGBRegressor(max_depth=6, n_estimators=20), window_length=13),
        make_reduction(LGBMRegressor(max_depth=7, n_estimators=52), window_length=20),
        make_reduction(XGBRegressor(max_depth=7, n_estimators=52), window_length=20),
        STLForecaster(seasonal=7, sp=7, trend=55, robust=True)
]

for model in models:
    model_results[str(model)] = ts_model_cv(y_train, forecaster=model, cv=cv, fh=fh, metric=metric)
Fold 0: 37.97236832813313
Fold 1: 68.93214605435246
Fold 2: 60.68519663159952
Fold 3: 86.96183638161638
Fold 4: 46.24662379946465
Fold 5: 78.33349162239456
Fold 6: 80.5210725967284
Fold 7: 81.44234255104284
Fold 8: 117.94604922837367
Fold 9: 66.11299487109609
Fold 10: 133.64575259066103
Fold 11: 172.95062789835293
Fold 12: 89.18719274300629
Fold 13: 76.48868387445403
Fold 14: 89.84934450144004
Fold 15: 53.33359616900093
Fold 16: 65.80589363869711
Fold 17: 80.87042430553993
Fold 18: 166.8559483745427
Fold 19: 204.39814922796813
Fold 20: 111.7448267877748
Fold 21: 69.52191758920553
Fold 22: 108.83709051664113
Fold 23: 69.04455017278157
Fold 24: 82.17249083547424
Fold 25: 47.10083625735518
Fold 26: 72.11365612545468
Fold 27: 77.11414288764338
Fold 28: 67.16779518424063
Fold 29: 63.752811492398095
Fold 30: 87.05450818576432
Fold 31: 78.27133412838076
Fold 32: 92.54020293423127
Fold 33: 58.31296196526873
Fold 34: 101.58889851031057
Fold 35: 176.18689424808724
Fold 36: 176.96400678100608
Fold 37: 45.788669811126056
Fold 38: 68.01896850372151
Fold 39: 100.56513266894753
Fold 40: 122.113330644381
Fold 41: 74.07232940380925
Fold 42: 124.90036549123342
Fold 43: 100.44341137910979
Fold 44: 122.71145211072066
Fold 45: 80.44080776359165
Fold 46: 84.46470759682114
Fold 47: 64.76880290139374
Fold 48: 71.60745835880164
Fold 49: 37.176850502731796
Fold 0: 29.520746624005643
Fold 1: 77.38277807391059
Fold 2: 28.18263042447652
Fold 3: 93.64629486456772
Fold 4: 26.239234462422804
Fold 5: 92.03833151564399
Fold 6: 36.63183165247759
Fold 7: 78.79703468272395
Fold 8: 37.68917663999696
Fold 9: 77.03111126716091
Fold 10: 130.1425494688859
Fold 11: 233.95771982372906
Fold 12: 47.935395869583886
Fold 13: 72.38716341664438
Fold 14: 176.57668292536243
Fold 15: 61.346521600382744
Fold 16: 38.07538105523951
Fold 17: 82.96647985963796
Fold 18: 170.94764885929675
Fold 19: 91.53727594698718
Fold 20: 78.27666296993834
Fold 21: 86.31406500920409
Fold 22: 53.66790033082141
Fold 23: 82.89120101287111
Fold 24: 42.71561625683541
Fold 25: 73.60212007860966
Fold 26: 57.91128372358087
Fold 27: 93.29264917042549
Fold 28: 67.52679156695561
Fold 29: 102.51397590660393
Fold 30: 59.01250017656453
Fold 31: 102.19646514301262
Fold 32: 72.04392509152478
Fold 33: 76.68025907516328
Fold 34: 81.44761273780234
Fold 35: 158.2848985823557
Fold 36: 222.8956078592562
Fold 37: 62.113231931323476
Fold 38: 56.49459234500989
Fold 39: 111.9937208469373
Fold 40: 55.15988784185965
Fold 41: 94.70242676391948
Fold 42: 172.0613042572896
Fold 43: 118.87789869543543
Fold 44: 115.64575908173587
Fold 45: 58.28291553876118
Fold 46: 58.5509149964444
Fold 47: 58.1621718206514
Fold 48: 62.56538080236745
Fold 49: 55.10369909826481
Fold 0: 74.7241338232219
Fold 1: 71.23601438530027
Fold 2: 66.63455470883827
Fold 3: 81.31840240502846
Fold 4: 62.6072531002815
Fold 5: 92.08254470155336
Fold 6: 72.33090906858212
Fold 7: 81.90122536641626
Fold 8: 81.24375761631416
Fold 9: 65.44658962687654
Fold 10: 132.31103926613042
Fold 11: 135.17313070592164
Fold 12: 106.21952642819954
Fold 13: 66.65829159466217
Fold 14: 63.89180601901875
Fold 15: 57.194043271569456
Fold 16: 45.079816552244
Fold 17: 78.8781692064243
Fold 18: 152.98739336405686
Fold 19: 104.51410604957034
Fold 20: 55.57375753747167
Fold 21: 68.86889440710594
Fold 22: 60.946419384013915
Fold 23: 60.56819290331203
Fold 24: 50.39617491499451
Fold 25: 49.51877088494729
Fold 26: 54.187461715142284
Fold 27: 67.22592567035016
Fold 28: 62.68097260730285
Fold 29: 74.49255598120838
Fold 30: 56.88353558784503
Fold 31: 68.32430332298541
Fold 32: 77.29166063726521
Fold 33: 61.286011379945
Fold 34: 85.49029439063229
Fold 35: 174.59067102770402
Fold 36: 156.11868313614562
Fold 37: 60.05537461597494
Fold 38: 62.925072270499435
Fold 39: 80.75379853733297
Fold 40: 107.21236415426955
Fold 41: 88.92610026085063
Fold 42: 56.698931541151204
Fold 43: 97.55989882466083
Fold 44: 81.02255816025968
Fold 45: 70.31098660349949
Fold 46: 84.29467490326196
Fold 47: 63.25185730414387
Fold 48: 49.6453813602413
Fold 49: 41.36082625388983
Fold 0: 54.80977996214783
Fold 1: 66.23211392656316
Fold 2: 72.2894523318756
Fold 3: 76.31182482161833
Fold 4: 60.70168366326003
Fold 5: 82.82278770195276
Fold 6: 81.18075431811091
Fold 7: 83.94548427185494
Fold 8: 59.91191263730318
Fold 9: 66.86877060094046
Fold 10: 138.93003848453117
Fold 11: 144.05632697882774
Fold 12: 140.2832849005002
Fold 13: 72.48750029582936
Fold 14: 65.04137204860987
Fold 15: 54.246004823641506
Fold 16: 38.39295871101111
Fold 17: 80.57246287814787
Fold 18: 142.40694909429342
Fold 19: 105.55344041317188
Fold 20: 52.87394660616728
Fold 21: 81.07370899392104
Fold 22: 59.03501016620012
Fold 23: 66.91065624552441
Fold 24: 67.35960167400195
Fold 25: 50.94312919641326
Fold 26: 67.20640071419886
Fold 27: 74.98417981281197
Fold 28: 71.83934976093245
Fold 29: 71.08798442624683
Fold 30: 57.17865347128641
Fold 31: 76.48276559216643
Fold 32: 74.01821860382246
Fold 33: 64.35079057516303
Fold 34: 82.55389049289523
Fold 35: 157.43813112767558
Fold 36: 144.2709960730875
Fold 37: 60.769067098626216
Fold 38: 68.83745588726782
Fold 39: 98.16225211392236
Fold 40: 122.46262493677085
Fold 41: 87.18926440504788
Fold 42: 66.64864820475837
Fold 43: 95.36291194766126
Fold 44: 88.55602614487533
Fold 45: 81.25343108754834
Fold 46: 83.73115020289495
Fold 47: 63.3602815917445
Fold 48: 53.1998895702403
Fold 49: 43.430686456962995
Fold 0: 75.87262486322734
Fold 1: 89.29492883328649
Fold 2: 61.795979134773695
Fold 3: 80.04508014566929
Fold 4: 52.304854663340905
Fold 5: 86.24236861589675
Fold 6: 68.33834465441409
Fold 7: 83.07370033130637
Fold 8: 60.64937504007555
Fold 9: 66.67362788351076
Fold 10: 140.39551318309987
Fold 11: 156.70916179639005
Fold 12: 115.00544407846951
Fold 13: 71.50856777018488
Fold 14: 58.66160511240305
Fold 15: 59.90980308863825
Fold 16: 50.2553935353961
Fold 17: 73.78761346452156
Fold 18: 160.23438882421442
Fold 19: 90.7500439614552
Fold 20: 54.9106883378124
Fold 21: 71.41695981441958
Fold 22: 49.79010312686895
Fold 23: 55.05599156184801
Fold 24: 43.754688833490974
Fold 25: 42.342474183595655
Fold 26: 61.37270356637474
Fold 27: 61.145761960280495
Fold 28: 45.456437471624
Fold 29: 66.01810781676637
Fold 30: 59.051019822677326
Fold 31: 66.81555220449304
Fold 32: 57.44948075352865
Fold 33: 61.3870152045166
Fold 34: 70.26811145540219
Fold 35: 151.05527266332743
Fold 36: 175.13546543353578
Fold 37: 58.610652691917196
Fold 38: 43.72537689800849
Fold 39: 97.54184835742853
Fold 40: 92.8920116290089
Fold 41: 77.76591884226083
Fold 42: 72.49875029910919
Fold 43: 90.22605275160498
Fold 44: 82.24746809083683
Fold 45: 69.60558216573412
Fold 46: 72.9060268035643
Fold 47: 49.81222869963842
Fold 48: 55.361699518130116
Fold 49: 42.91537781247247
Fold 0: 64.08348839945558
Fold 1: 62.41399045668362
Fold 2: 55.352235584934284
Fold 3: 78.90407020430737
Fold 4: 57.24300443565902
Fold 5: 85.10189606126877
Fold 6: 74.30193149173994
Fold 7: 81.14215930114118
Fold 8: 63.886711675113986
Fold 9: 58.01137133788293
Fold 10: 132.7209903168583
Fold 11: 155.30380334950496
Fold 12: 118.39304148427108
Fold 13: 79.3487223036532
Fold 14: 54.394916572427
Fold 15: 62.06390775530357
Fold 16: 63.32431946326353
Fold 17: 79.1133250633088
Fold 18: 155.76178568520436
Fold 19: 161.1049141065568
Fold 20: 69.76371134623182
Fold 21: 65.21655268536509
Fold 22: 61.67626029653103
Fold 23: 48.849048725241516
Fold 24: 36.06435461368767
Fold 25: 45.93507096157189
Fold 26: 70.45862300131276
Fold 27: 69.42887352014021
Fold 28: 74.59202434679663
Fold 29: 68.51354280753789
Fold 30: 56.683422841621685
Fold 31: 76.34559481892062
Fold 32: 63.00039682659467
Fold 33: 63.4382082722158
Fold 34: 66.41031246753414
Fold 35: 157.55467880064728
Fold 36: 132.53087240448096
Fold 37: 66.52479263461917
Fold 38: 66.86281810708691
Fold 39: 104.21202351169165
Fold 40: 96.662992494875
Fold 41: 76.18482527955891
Fold 42: 74.71108640808019
Fold 43: 87.52203493881241
Fold 44: 97.35124346753673
Fold 45: 66.73063691999512
Fold 46: 74.74907645945537
Fold 47: 72.63438819171414
Fold 48: 50.946948625742756
Fold 49: 48.92718854767634
Fold 0: 37.56722316734588
Fold 1: 67.28667473621381
Fold 2: 48.89007328801562
Fold 3: 80.44811088316418
Fold 4: 43.543583904051665
Fold 5: 76.40930508467474
Fold 6: 61.949251209966945
Fold 7: 80.27774328064156
Fold 8: 55.875642574761315
Fold 9: 59.55086317850294
Fold 10: 133.88787415684735
Fold 11: 125.42926609900331
Fold 12: 178.65429061321782
Fold 13: 76.27066796790204
Fold 14: 73.52108728506688
Fold 15: 56.15489153559924
Fold 16: 56.12818621406225
Fold 17: 78.79032177013453
Fold 18: 176.39439090655947
Fold 19: 87.58101808117378
Fold 20: 116.86807871845876
Fold 21: 67.18183667835213
Fold 22: 65.79165378614249
Fold 23: 68.5102177478987
Fold 24: 63.14766066366037
Fold 25: 45.58302707813617
Fold 26: 61.238648608738856
Fold 27: 69.96788855934953
Fold 28: 58.169805400545116
Fold 29: 62.08206193484601
Fold 30: 68.3185424970567
Fold 31: 76.13357758251533
Fold 32: 74.61259740503112
Fold 33: 64.55988984992439
Fold 34: 86.19734606649563
Fold 35: 171.11781752749738
Fold 36: 147.40297007475485
Fold 37: 66.80340693793572
Fold 38: 66.5159005835431
Fold 39: 90.16228599256632
Fold 40: 100.73270841391026
Fold 41: 72.841537474245
Fold 42: 73.89008583018811
Fold 43: 93.54958590809923
Fold 44: 57.8561070080154
Fold 45: 77.55928621936887
Fold 46: 70.4966346718921
Fold 47: 70.00761426596267
Fold 48: 69.20299437009729
Fold 49: 34.868912907962894
In [72]:
print(model_results)
{'NaiveForecaster(sp=7)': 89.90201894253744, "ExponentialSmoothing(seasonal='add', sp=7, trend='add')": 85.48038855489321, 'RecursiveTabularRegressionForecaster(estimator=LGBMRegressor(max_depth=6,\n                                                             n_estimators=20),\n                                     window_length=13)': 78.41789635077235, 'RecursiveTabularRegressionForecaster(estimator=XGBRegressor(base_score=None,\n                                                            booster=None,\n                                                            callbacks=None,\n                                                            colsample_bylevel=None,\n                                                            colsample_bynode=None,\n                                                            colsample_bytree=None,\n                                                            early_stopping_rounds=None,\n                                                            enable_categorical=False,\n                                                            eval_metric=None,\n                                                            feature_types=None,\n                                                            gamma=None,\n                                                            gpu_id=None,\n                                                            grow_policy=None,\n                                                            importance_type=None,\n                                                            interaction_constraints=None,\n                                                            learning_rate=None,\n                                                            max_bin=None,\n                                                            max_cat_threshold=None,\n                                                            max_cat_to_onehot=None,\n                                                            max_delta_step=None,\n                                                            max_depth=6,\n                                                            max_leaves=None,\n                                                            min_child_weight=None,\n                                                            missing=nan,\n                                                            monotone_constraints=None,\n                                                            n_estimators=20,\n                                                            n_jobs=None,\n                                                            num_parallel_tree=None,\n                                                            predictor=None,\n                                                            random_state=None, ...),\n                                     window_length=13)': 80.39232012090055, 'RecursiveTabularRegressionForecaster(estimator=LGBMRegressor(max_depth=7,\n                                                             n_estimators=52),\n                                     window_length=20)': 76.00086495501101, 'RecursiveTabularRegressionForecaster(estimator=XGBRegressor(base_score=None,\n                                                            booster=None,\n                                                            callbacks=None,\n                                                            colsample_bylevel=None,\n                                                            colsample_bynode=None,\n                                                            colsample_bytree=None,\n                                                            early_stopping_rounds=None,\n                                                            enable_categorical=False,\n                                                            eval_metric=None,\n                                                            feature_types=None,\n                                                            gamma=None,\n                                                            gpu_id=None,\n                                                            grow_policy=None,\n                                                            importance_type=None,\n                                                            interaction_constraints=None,\n                                                            learning_rate=None,\n                                                            max_bin=None,\n                                                            max_cat_threshold=None,\n                                                            max_cat_to_onehot=None,\n                                                            max_delta_step=None,\n                                                            max_depth=7,\n                                                            max_leaves=None,\n                                                            min_child_weight=None,\n                                                            missing=nan,\n                                                            monotone_constraints=None,\n                                                            n_estimators=52,\n                                                            n_jobs=None,\n                                                            num_parallel_tree=None,\n                                                            predictor=None,\n                                                            random_state=None, ...),\n                                     window_length=20)': 79.04904378743629, 'STLForecaster(robust=True, sp=7, trend=55)': 79.31962293400193}
In [9]:
from sktime.forecasting.trend import STLForecaster

model = STLForecaster(seasonal=7, sp=7, trend=55, robust=True)
model.fit(y_train)
fh = ForecastingHorizon(y_test.index, is_relative=False)
y_pred = model.predict(fh)
print('RMSE {}'.format(rmse(y_test,y_pred)))
RMSE 75.56295369442584
In [14]:
for col in y_train.columns:
    plot_series(y_train[col], y_test[col], y_pred[col])
Interpreting CV results¶

We can see that from trying a few different models and experiementing with a few hyperparameters for the STL forecaster, we have obtained a lower CV score with the STLforecaster using sp=7 and trend = 55. Recall that in the EDA we found that an sp of 28 smoothed the trend out enough to where enough of the seasonality was beiing caught in the decomposition. It turns out this is actually due to the trend hyperparameter, but by default it is set based off of the sp value. The sp value should be 7 to capture weekly seasonality, so the 55 value was calculated using the formula from the documentation based on the sp=28 value we used during decomposition.

We can also see when we check the rmse vs our hold out test set, the performance is better than with the NaiveForecaster alone. There are many hyperparameters for STLForecaster we could continue to tune, such as using different forecasters within the process to forecast each individual componenet. We can also observe that a large source of error for both models was the huge surge in "SCHOOL AND OFFICE SUPPLIES" that occurred during the test set period.

In [22]:
#lets output these results to CSV to use in our dashboard
y_pred.unstack().to_csv('STLForecaster_pred.csv', index_label=[0,1,2])
In [65]:
from sktime.forecasting.model_selection import ForecastingRandomizedSearchCV
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.compose import make_reduction
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.model_selection import ExpandingWindowSplitter
from lightgbm import LGBMRegressor

rmse = MeanSquaredError(square_root=True)
regressor = LGBMRegressor()
forecaster = make_reduction(regressor)
nested_params = {"window_length": list(range(2,21)), 
                 "estimator__max_depth": list(range(5,16)),
                 "estimator__n_estimators" : list(range(20,120))}
fh = ForecastingHorizon(np.arange(1,16))
cv = ExpandingWindowSplitter(fh=fh, initial_window=42, step_length=15)

nrcv = ForecastingRandomizedSearchCV(forecaster, strategy="refit", cv=cv, 
                                     param_distributions=nested_params, 
                                     n_iter=10, random_state=42, scoring=rmse)
nrcv.fit(y_train)
print(nrcv.best_params_)
print(nrcv.best_score_)
print(nrcv.cv_results_)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[65], line 22
     18 nrcv = ForecastingRandomizedSearchCV(forecaster, strategy="refit", cv=cv, 
     19                                      param_distributions=nested_params, 
     20                                      n_iter=10, random_state=42, scoring=rmse)
     21 nrcv.fit(y_train)
---> 22 print(nrcv.best_params_)
     23 print(nrcv.best_score_)
     24 print(nrcv.cv_results_)

AttributeError: 'ForecastingRandomizedSearchCV' object has no attribute 'best_params_'
In [67]:
y_pred = nrcv.predict(fh=fh)
print(rmse(y_test,y_pred))
69.52885618089292
In [68]:
nrcv.get_fitted_params()
Out[68]:
{'forecasters': family                                              AUTOMOTIVE  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                               BABY CARE  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                  BEAUTY  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                               BEVERAGES  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                   BOOKS  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                            BREAD/BAKERY  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                             CELEBRATION  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                CLEANING  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                   DAIRY  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                    DELI  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                    EGGS  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                            FROZEN FOODS  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                               GROCERY I  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                              GROCERY II  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                HARDWARE  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                      HOME AND KITCHEN I  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                     HOME AND KITCHEN II  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                         HOME APPLIANCES  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                               HOME CARE  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                              LADIESWEAR  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                         LAWN AND GARDEN  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                LINGERIE  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                        LIQUOR,WINE,BEER  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                               MAGAZINES  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                   MEATS  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                           PERSONAL CARE  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                            PET SUPPLIES  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                 PLAYERS AND ELECTRONICS  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                 POULTRY  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                          PREPARED FOODS  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                 PRODUCE  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                              SCHOOL AND OFFICE SUPPLIES  \
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...   
 
 family                                                 SEAFOOD  
 forecasters  ForecastingRandomizedSearchCV(cv=ExpandingWind...  ,
 'forecasters.loc[forecasters,AUTOMOTIVE]': ForecastingRandomizedSearchCV(cv=ExpandingWindowSplitter(fh=ForecastingHorizon([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], dtype='int64', is_relative=True),
                                                          initial_window=42,
                                                          step_length=15),
                               forecaster=RecursiveTabularRegressionForecaster(estimator=LGBMRegressor()),
                               param_distributions={'estimator__max_depth': [5,
                                                                             6,
                                                                             7,
                                                                             8,
                                                                             9,
                                                                             10,
                                                                             11,
                                                                             12,
                                                                             13,
                                                                             14,
                                                                             15],
                                                    'estimator__n_estimators': [20,
                                                                                21,
                                                                                22,
                                                                                23,
                                                                                24,
                                                                                25,
                                                                                26,
                                                                                27,
                                                                                28,
                                                                                29,
                                                                                30,
                                                                                31,
                                                                                32,
                                                                                33,
                                                                                34,
                                                                                35,
                                                                                36,
                                                                                37,
                                                                                38,
                                                                                39,
                                                                                40,
                                                                                41,
                                                                                42,
                                                                                43,
                                                                                44,
                                                                                45,
                                                                                46,
                                                                                47,
                                                                                48,
                                                                                49, ...],
                                                    'window_length': [2, 3, 4, 5,
                                                                      6, 7, 8, 9,
                                                                      10, 11, 12,
                                                                      13, 14, 15,
                                                                      16, 17, 18,
                                                                      19, 20]},
                               random_state=42,
                               scoring=MeanSquaredError(square_root=True)),
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator': LGBMRegressor(max_depth=7, n_estimators=52),
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__transformers': None,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__window_length': 20,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__best_iteration': None,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__best_score': defaultdict(collections.OrderedDict,
             {}),
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__booster': <lightgbm.basic.Booster at 0x7f5409277dc0>,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__evals_result': None,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__feature_importances': array([ 38,  42,  42,  48,  68,  51,  49,  46,  50,  77,  40,  37,  53,
         72,  57,  34,  59,  47,  56, 113], dtype=int32),
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__feature_name': ['Column_0',
  'Column_1',
  'Column_2',
  'Column_3',
  'Column_4',
  'Column_5',
  'Column_6',
  'Column_7',
  'Column_8',
  'Column_9',
  'Column_10',
  'Column_11',
  'Column_12',
  'Column_13',
  'Column_14',
  'Column_15',
  'Column_16',
  'Column_17',
  'Column_18',
  'Column_19'],
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__fitted': True,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__n_features': 20,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__n_features_in': 20,
 'forecasters.loc[forecasters,AUTOMOTIVE]__best_forecaster__estimator__objective': 'regression',
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator': LGBMRegressor(max_depth=7, n_estimators=52),
 'forecasters.loc[forecasters,AUTOMOTIVE]__transformers': None,
 'forecasters.loc[forecasters,AUTOMOTIVE]__window_length': 20,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__best_iteration': None,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__best_score': defaultdict(collections.OrderedDict,
             {}),
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__booster': <lightgbm.basic.Booster at 0x7f5409277dc0>,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__evals_result': None,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__feature_importances': array([ 38,  42,  42,  48,  68,  51,  49,  46,  50,  77,  40,  37,  53,
         72,  57,  34,  59,  47,  56, 113], dtype=int32),
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__feature_name': ['Column_0',
  'Column_1',
  'Column_2',
  'Column_3',
  'Column_4',
  'Column_5',
  'Column_6',
  'Column_7',
  'Column_8',
  'Column_9',
  'Column_10',
  'Column_11',
  'Column_12',
  'Column_13',
  'Column_14',
  'Column_15',
  'Column_16',
  'Column_17',
  'Column_18',
  'Column_19'],
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__fitted': True,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__n_features': 20,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__n_features_in': 20,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__objective': 'regression',
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__n_estimators': 52,
 'forecasters.loc[forecasters,AUTOMOTIVE]__estimator__max_depth': 7}
In [ ]: